Normalize the St. Louis Data Set


In [17]:
import json
import pandas as pd
import datetime
import numpy as np

In [18]:
df = pd.read_csv('missouri.csv')


/home/mdboom/.pyenv/versions/miniconda3-3.16.0/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (4,5,6,8,11,14,18,19,23,25,26,28,32) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [19]:
df.columns


Out[19]:
Index(['Unnamed: 0', ':created_at', ':id', ':updated_at', 'applicant_name',
       'applicant_representative', 'class_description', 'council_district',
       'county', 'day', 'dwelling_units_gained_or_lost', 'fraction',
       'how_far_east', 'how_far_north', 'legal_description', 'month', 'name',
       'number', 'owner_name', 'parcel', 'permit_number', 'permit_value',
       'pin', 'plan_area', 'prefix', 'project_description',
       'property_description', 'res_non', 'school_district', 'sf_mf',
       'structure_class', 'suffix', 'suite', 'type', 'year'],
      dtype='object')

In [20]:
len(df)


Out[20]:
290573

In [21]:
# Remove everything after 2008 sinee the data changes there
df = df[df['year'] < 2008]

In [22]:
len(df)


Out[22]:
167845

In [23]:
# These are at least theoretically helpful columns
useful_columns = [
    'day', 'dwelling_units_gained_or_lost',
    'month', 'permit_value', 
    'res_non', 'sf_mf', 'structure_class', 'year', 'type']
df = df[useful_columns]

In [24]:
df['target'] = (df['type'] == 'New Construction')

In [25]:
df.to_csv('missouri-normal.csv')

In [ ]: